% Dolma pretraining corpus paper (arXiv preprint; identifier is carried in the journal field).
% Only the proper noun in the title is brace-protected so styles can still apply their own casing.
@article{2024_dolma,
    title   = {{Dolma}: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research},
    author  = {
        Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk
        and David Atkinson and Russell Authur and Ben Bogin and Khyathi Chandu
        and Jennifer Dumas and Yanai Elazar and Valentin Hofmann
        and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu
        and Nathan Lambert and Ian Magnusson and Jacob Morrison
        and Niklas Muennighoff and Aakanksha Naik and Crystal Nam
        and Matthew E. Peters and Abhilasha Ravichander and Kyle Richardson
        and Zejiang Shen and Emma Strubell and Nishant Subramani
        and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer
        and Noah A. Smith and Hannaneh Hajishirzi and Iz Beltagy
        and Dirk Groeneveld and Jesse Dodge and Kyle Lo
    },
    year    = {2024},
    journal = {arXiv preprint arXiv:2402.00159},
}

% FineWeb-Edu dataset release, cited by DOI and dataset URL.
% The title is brace-protected because sentence-casing styles would otherwise lowercase it.
@misc{2024_fineweb_edu,
    title  = {{FineWeb-Edu}},
    author = {Lozhkov, Anton and Ben Allal, Loubna and
              von Werra, Leandro and Wolf, Thomas},
    month  = may,
    year   = {2024},
    doi    = {10.57967/hf/2497},
    url    = {https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu}
}

% The Pile dataset paper (arXiv preprint); the author list is truncated with "and others".
% Title is stored in title case with the proper noun and the size token brace-protected.
@article{2020_thepile,
    title   = {The {Pile}: An {800GB} Dataset of Diverse Text for Language Modeling},
    author  = {Gao, Leo and Biderman, Stella and Black, Sid and
               Golding, Laurence and Hoppe, Travis and Foster, Charles and
               Phang, Jason and He, Horace and Thite, Anish and
               Nabeshima, Noa and others},
    journal = {arXiv preprint arXiv:2101.00027},
    year    = {2020}
}

% Datasheet accompanying the Pile dataset (arXiv preprint).
% "Pile" is a proper noun, so it is capitalised and brace-protected against style down-casing.
@article{2022_thepile_datasheet,
    title   = {Datasheet for the {Pile}},
    author  = {Biderman, Stella and Bicheno, Kieran and Gao, Leo},
    journal = {arXiv preprint arXiv:2201.07311},
    year    = {2022}
}

% RedPajama dataset release, cited by its GitHub repository URL.
% The author is an organisation: the extra brace pair keeps it from being split into first/last name.
@misc{2023_redpajama,
    title  = {{RedPajama}: An Open Source Recipe to Reproduce {LLaMA} training dataset},
    author = {{Together Computer}},
    month  = apr,
    year   = {2023},
    url    = {https://github.com/togethercomputer/RedPajama-Data}
}

% SlimPajama dataset release.
% howpublished points at the announcement blog post; url points at the hosted dataset.
@misc{2023_slimpajama,
    title        = {{SlimPajama}: A {627B} token cleaned and deduplicated version of {RedPajama}},
    author       = {Soboleva, Daria and Al-Khateeb, Faisal and Myers, Robert and
                    Steeves, Jacob R and Hestness, Joel and Dey, Nolan},
    month        = jun,
    year         = {2023},
    howpublished = {\url{https://cerebras.ai/blog/slimpajama-a-627b-token-cleaned-and-deduplicated-version-of-redpajama}},
    url          = {https://huggingface.co/datasets/cerebras/SlimPajama-627B},
}


% Pointer Sentinel Mixture Models paper, identified through its arXiv eprint fields.
@misc{2016_pointer_sentinel,
    author        = {Merity, Stephen and Xiong, Caiming and
                     Bradbury, James and Socher, Richard},
    title         = {Pointer Sentinel Mixture Models},
    year          = 2016,
    eprint        = {1609.07843},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
}
